#Loading Libraries

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmboot)
library(lattice)
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
library(caret)
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(naniar)
library(utils)
library(stats)

## Reading in the dataset

# NOTE(review): the working directory is hard-coded to one machine; the script
# stops here on any system where this path does not exist.
setwd(dir = "/Users/xaviermojica/Desktop/Stats2/Project1")
life <- read.csv(file = "Life Expectancy Data (1).csv")

# First look at the raw relationship between GDP and life expectancy.
ggplot(life, aes(x = GDP, y = Life.expectancy)) +
  geom_point()
## Warning: Removed 453 rows containing missing values (`geom_point()`).

##Looking at the graph of the original data set, it appears that a log transformation is needed on X (GDP), since we are interested in the relationship between Life Expectancy and GDP.

## Checking data types

# Structure of the raw data: column names, storage types and first values.
str(life)
## 'data.frame':    2938 obs. of  22 variables:
##  $ Country                        : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : int  2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
##  $ Status                         : chr  "Developing" "Developing" "Developing" "Developing" ...
##  $ Life.expectancy                : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality                : int  263 271 268 272 275 279 281 287 295 295 ...
##  $ infant.deaths                  : int  62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure         : num  71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis.B                    : int  65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : int  1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
##  $ BMI                            : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths              : int  83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : int  6 58 62 67 68 66 63 64 63 58 ...
##  $ Total.expenditure              : num  8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : int  65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num  584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num  33736494 327582 31731688 3696958 2978599 ...
##  $ thinness..1.19.years           : num  17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness.5.9.years             : num  17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income.composition.of.resources: num  0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Missingness map; the x-axis labels are rotated 90 degrees so the column
# names do not overlap.
vis_miss(life) + theme(axis.text.x = element_text(angle = 90, hjust = 0))

dim(life)
## [1] 2938   22
# View() opens a data viewer, which is only useful in an interactive session,
# so it is skipped when the script is run non-interactively.
if (interactive()) {
  View(life)
}
#sum(is.na(life))

# Imputing using the median

# GDP (column 17) is about 15% missing but is kept and imputed: it is assumed
# to be crucial for predicting Life.expectancy, since richer countries have
# better access to health care, medicine and technology. The values appear to
# be GDP per capita, which already accounts for GDP / Population, so keeping
# both GDP and Population would presumably add closely related predictors.
#
# NOTE(review): the medians are computed on the full data set, before the
# train/validation split further down, so validation rows influence the
# imputed values. Confirm this is acceptable.

# Missingness map with the x-axis labels rotated 90 degrees.
plot_missing <- function(df) {
  vis_miss(df) + theme(axis.text.x = element_text(angle = 90, hjust = 0))
}

# Columns left out of the imputation: the identifiers, the response
# (Life.expectancy) and Hepatitis.B. These are columns 1:4 and 9 of `life`;
# selecting them by name keeps the code correct if the column order changes.
not_imputed <- c("Country", "Year", "Status", "Life.expectancy", "Hepatitis.B")
imputeMedian <- preProcess(
  life[, !(names(life) %in% not_imputed)],
  method = "medianImpute"
)
cleandataMedian <- predict(imputeMedian, newdata = life)
dim(cleandataMedian)
## [1] 2938   22
plot_missing(cleandataMedian)

# Literature says that over 10% missing data can contribute to bias.
# Hepatitis.B (19% missing) and Population (22% missing) are removed; these
# were columns 9 and 18 in the original column order.
too_sparse <- c("Hepatitis.B", "Population")
cleandataMedian <- cleandataMedian[, !(names(cleandataMedian) %in% too_sparse)]
plot_missing(cleandataMedian)

# Drop the rows that still contain an NA.
cleandataMedian <- na.omit(cleandataMedian)
plot_missing(cleandataMedian)

# Scatter of the cleaned data on the original scale.
ggplot(cleandataMedian, aes(x = GDP, y = Life.expectancy)) +
  geom_point()

# Add natural-log versions of GDP and of Life.expectancy as new columns.
cleandataMedian[["logGDP"]] <- log(cleandataMedian[["GDP"]])
cleandataMedian[["logLife.expectancy"]] <- log(cleandataMedian[["Life.expectancy"]])

# Same scatter with both axes on the log scale.
ggplot(cleandataMedian, aes(x = logGDP, y = logLife.expectancy)) +
  geom_point()

# Split data: 80% training, 20% validation

set.seed(1234)
# list = FALSE makes createDataPartition() return the selected row indices as
# a matrix rather than a list. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
trainIndex <- createDataPartition(
  cleandataMedian$Life.expectancy,
  p = 0.8,
  list = FALSE
)

training <- cleandataMedian[trainIndex, ]
validate <- cleandataMedian[-trainIndex, ]

#Multivariable Plots on the Training Data

library(ISLR)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(GGally)
library(ggplot2)

# Scatterplot matrix for the given columns of `training`: red points in the
# lower continuous panels, orange box plots for mixed panels, yellow faceted
# bars for discrete pairs and blue density curves on the diagonal.
plot_training_pairs <- function(cols) {
  ggpairs(
    training[, cols],
    lower = list(
      continuous = wrap("points", color = "red", alpha = 0.5),
      combo = wrap("box", color = "orange", alpha = 0.3),
      discrete = wrap("facetbar", color = "yellow", alpha = 0.3)
    ),
    diag = list(continuous = wrap("densityDiag", color = "blue", alpha = 0.5))
  )
}

# Column 4 (Life.expectancy) appears in every matrix, each time next to a
# different group of four other columns.
plot_training_pairs(4:8)

plot_training_pairs(c(4, 9:12))

plot_training_pairs(c(4, 13:16))

plot_training_pairs(c(4, 17:20))

#ggpairs(cleandataMedian[,5:22], upper = list(continuous = wrap("cor", size = 4.75, align_percent = 1)))
#ggscatmat(cleandataMedian, columns = 4:10)

# First model and its residuals
# NOTE(review): despite the name, `eightVar` is fitted with nine predictors.

set.seed(2345)
eightVar <- lm(
  formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + BMI + Polio +
    Diphtheria + logGDP + thinness..1.19.years +
    Income.composition.of.resources,
  data = training
)
summary(eightVar)
## 
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + 
##     BMI + Polio + Diphtheria + logGDP + thinness..1.19.years + 
##     Income.composition.of.resources, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.4141  -2.5857   0.1184   2.7623  19.2743 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     43.206803   0.625395  69.087  < 2e-16 ***
## HIV.AIDS                        -0.681485   0.019905 -34.236  < 2e-16 ***
## Schooling                        0.807629   0.055413  14.575  < 2e-16 ***
## Alcohol                          0.048487   0.030055   1.613    0.107    
## BMI                              0.055012   0.006301   8.731  < 2e-16 ***
## Polio                            0.029326   0.005694   5.151 2.82e-07 ***
## Diphtheria                       0.048879   0.005683   8.601  < 2e-16 ***
## logGDP                           0.579798   0.070900   8.178 4.69e-16 ***
## thinness..1.19.years            -0.113199   0.027585  -4.104 4.21e-05 ***
## Income.composition.of.resources  7.810187   0.825457   9.462  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.657 on 2334 degrees of freedom
## Multiple R-squared:  0.7607, Adjusted R-squared:  0.7598 
## F-statistic: 824.6 on 9 and 2334 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients (0.95 is the default level).
confint(eightVar, level = 0.95)
##                                       2.5 %      97.5 %
## (Intercept)                     41.98041532 44.43319143
## HIV.AIDS                        -0.72051911 -0.64245093
## Schooling                        0.69896506  0.91629379
## Alcohol                         -0.01045042  0.10742491
## BMI                              0.04265642  0.06736755
## Polio                            0.01816053  0.04049132
## Diphtheria                       0.03773517  0.06002214
## logGDP                           0.44076547  0.71883070
## thinness..1.19.years            -0.16729240 -0.05910629
## Income.composition.of.resources  6.19148283  9.42889208
# Visuals for residuals: the four diagnostic plots plot.lm() draws by default.
plot(eightVar, which = c(1, 2, 3, 5))

# Best final MLR: the first model without Alcohol and Polio (seven predictors)

set.seed(2323)
lessvar <- lm(
  formula = Life.expectancy ~ HIV.AIDS + Schooling + BMI + Diphtheria +
    logGDP + thinness..1.19.years + Income.composition.of.resources,
  data = training
)
summary(lessvar)
## 
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + BMI + Diphtheria + 
##     logGDP + thinness..1.19.years + Income.composition.of.resources, 
##     data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.382  -2.545   0.095   2.768  20.119 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     43.642032   0.618828  70.524  < 2e-16 ***
## HIV.AIDS                        -0.680044   0.019876 -34.215  < 2e-16 ***
## Schooling                        0.844081   0.054243  15.561  < 2e-16 ***
## BMI                              0.056115   0.006332   8.862  < 2e-16 ***
## Diphtheria                       0.067500   0.004437  15.211  < 2e-16 ***
## logGDP                           0.599579   0.070806   8.468  < 2e-16 ***
## thinness..1.19.years            -0.123684   0.026922  -4.594 4.58e-06 ***
## Income.composition.of.resources  7.952176   0.829769   9.584  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.684 on 2336 degrees of freedom
## Multiple R-squared:  0.7578, Adjusted R-squared:  0.757 
## F-statistic:  1044 on 7 and 2336 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients (0.95 is the default level).
confint(lessvar, level = 0.95)
##                                       2.5 %      97.5 %
## (Intercept)                     42.42852263 44.85554107
## HIV.AIDS                        -0.71901981 -0.64106761
## Schooling                        0.73771169  0.95044987
## BMI                              0.04369781  0.06853138
## Diphtheria                       0.05879867  0.07620224
## logGDP                           0.46073069  0.73842704
## thinness..1.19.years            -0.17647798 -0.07089008
## Income.composition.of.resources  6.32501538  9.57933583
# Visuals for residuals: the four diagnostic plots plot.lm() draws by default.
plot(lessvar, which = c(1, 2, 3, 5))

#Forward, Backward, Stepwise Selection

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Full model: every candidate predictor
set.seed(1246)
fitFull <- lm(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = training
)

# Forward selection must start from a small model and be told how far it may
# grow. When it was started from `fitFull` without a `scope`, stepAIC() used
# the starting model as the upper limit, so nothing could be added and
# `stepup` was simply `fitFull` again (the recorded trace stopped right after
# "Start:  AIC=6586.08"). Start from the intercept-only model instead and
# allow terms up to the full model.
fitNull <- lm(Life.expectancy ~ 1, data = training)
stepup <- stepAIC(
  fitNull,
  scope = list(lower = formula(fitNull), upper = formula(fitFull)),
  direction = "forward",
  steps = 2000
)
# NOTE(review): re-run to regenerate the selection trace. The recorded summary
# of `stepup` further down predates this fix and still shows the full model.
# Backward elimination by AIC, starting from the full model.
stepdown <- stepAIC(object = fitFull, direction = "backward", steps = 2000)
## Start:  AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## - thinness.5.9.years               1       9.3 38373 6584.6
## <none>                                         38364 6586.1
## - thinness..1.19.years             1      64.6 38428 6588.0
## - Measles                          1      85.5 38449 6589.3
## - Total.expenditure                1     115.3 38479 6591.1
## - percentage.expenditure           1     209.3 38573 6596.8
## - Alcohol                          1     227.5 38591 6597.9
## - Polio                            1     360.2 38724 6606.0
## - logGDP                           1     488.4 38852 6613.7
## - Diphtheria                       1     869.0 39233 6636.6
## - BMI                              1     884.3 39248 6637.5
## - Income.composition.of.resources  1     997.9 39362 6644.3
## - infant.deaths                    1    1914.8 40279 6698.2
## - under.five.deaths                1    1950.0 40314 6700.3
## - Schooling                        1    3043.1 41407 6763.0
## - HIV.AIDS                         1    8282.4 46646 7042.3
## - Adult.Mortality                  1    9123.8 47488 7084.2
## 
## Step:  AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## <none>                                         38373 6584.6
## - Measles                          1      88.3 38461 6588.0
## - Total.expenditure                1     111.9 38485 6589.5
## - thinness..1.19.years             1     125.1 38498 6590.3
## - percentage.expenditure           1     209.3 38582 6595.4
## - Alcohol                          1     225.6 38599 6596.4
## - Polio                            1     360.1 38733 6604.5
## - logGDP                           1     482.6 38856 6611.9
## - Diphtheria                       1     873.4 39246 6635.4
## - BMI                              1     875.2 39248 6635.5
## - Income.composition.of.resources  1     998.5 39372 6642.9
## - infant.deaths                    1    1925.1 40298 6697.4
## - under.five.deaths                1    1955.5 40329 6699.2
## - Schooling                        1    3052.4 41425 6762.1
## - HIV.AIDS                         1    8273.2 46646 7040.3
## - Adult.Mortality                  1    9114.8 47488 7082.2
# Stepwise search by AIC from the full model: terms may be dropped and added
# back at each step.
stepboth <- stepAIC(object = fitFull, direction = "both", steps = 2000)
## Start:  AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## - thinness.5.9.years               1       9.3 38373 6584.6
## <none>                                         38364 6586.1
## - thinness..1.19.years             1      64.6 38428 6588.0
## - Measles                          1      85.5 38449 6589.3
## - Total.expenditure                1     115.3 38479 6591.1
## - percentage.expenditure           1     209.3 38573 6596.8
## - Alcohol                          1     227.5 38591 6597.9
## - Polio                            1     360.2 38724 6606.0
## - logGDP                           1     488.4 38852 6613.7
## - Diphtheria                       1     869.0 39233 6636.6
## - BMI                              1     884.3 39248 6637.5
## - Income.composition.of.resources  1     997.9 39362 6644.3
## - infant.deaths                    1    1914.8 40279 6698.2
## - under.five.deaths                1    1950.0 40314 6700.3
## - Schooling                        1    3043.1 41407 6763.0
## - HIV.AIDS                         1    8282.4 46646 7042.3
## - Adult.Mortality                  1    9123.8 47488 7084.2
## 
## Step:  AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## <none>                                         38373 6584.6
## + thinness.5.9.years               1       9.3 38364 6586.1
## - Measles                          1      88.3 38461 6588.0
## - Total.expenditure                1     111.9 38485 6589.5
## - thinness..1.19.years             1     125.1 38498 6590.3
## - percentage.expenditure           1     209.3 38582 6595.4
## - Alcohol                          1     225.6 38599 6596.4
## - Polio                            1     360.1 38733 6604.5
## - logGDP                           1     482.6 38856 6611.9
## - Diphtheria                       1     873.4 39246 6635.4
## - BMI                              1     875.2 39248 6635.5
## - Income.composition.of.resources  1     998.5 39372 6642.9
## - infant.deaths                    1    1925.1 40298 6697.4
## - under.five.deaths                1    1955.5 40329 6699.2
## - Schooling                        1    3052.4 41425 6762.1
## - HIV.AIDS                         1    8273.2 46646 7040.3
## - Adult.Mortality                  1    9114.8 47488 7082.2
# Summary for each selected model, stored and then printed.
# Forward selection result:
up <- summary(object = stepup)
up
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0294  -2.1028   0.0757   2.3710  16.0441 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.188e+01  6.977e-01  74.359  < 2e-16 ***
## Adult.Mortality                 -2.091e-02  8.887e-04 -23.525  < 2e-16 ***
## infant.deaths                    9.970e-02  9.251e-03  10.777  < 2e-16 ***
## Alcohol                          1.017e-01  2.737e-02   3.715 0.000208 ***
## percentage.expenditure           1.944e-04  5.455e-05   3.563 0.000374 ***
## Measles                         -1.934e-05  8.491e-06  -2.277 0.022848 *  
## BMI                              4.111e-02  5.613e-03   7.324 3.31e-13 ***
## under.five.deaths               -7.429e-02  6.831e-03 -10.876  < 2e-16 ***
## Polio                            2.327e-02  4.977e-03   4.674 3.12e-06 ***
## Total.expenditure                9.986e-02  3.776e-02   2.644 0.008237 ** 
## Diphtheria                       3.639e-02  5.012e-03   7.260 5.25e-13 ***
## HIV.AIDS                        -4.473e-01  1.996e-02 -22.414  < 2e-16 ***
## thinness..1.19.years            -1.103e-01  5.568e-02  -1.980 0.047825 *  
## thinness.5.9.years               4.121e-02  5.495e-02   0.750 0.453338    
## Income.composition.of.resources  5.677e+00  7.297e-01   7.780 1.08e-14 ***
## Schooling                        6.622e-01  4.874e-02  13.586  < 2e-16 ***
## logGDP                           3.792e-01  6.968e-02   5.443 5.80e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.06 on 2327 degrees of freedom
## Multiple R-squared:  0.8186, Adjusted R-squared:  0.8174 
## F-statistic: 656.5 on 16 and 2327 DF,  p-value: < 2.2e-16
# Backward elimination result:
down <- summary(object = stepdown)
down
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0079  -2.1041   0.0671   2.3681  16.0263 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.193e+01  6.942e-01  74.809  < 2e-16 ***
## Adult.Mortality                 -2.089e-02  8.883e-04 -23.515  < 2e-16 ***
## infant.deaths                    9.992e-02  9.246e-03  10.807  < 2e-16 ***
## Alcohol                          1.012e-01  2.736e-02   3.700 0.000221 ***
## percentage.expenditure           1.944e-04  5.454e-05   3.564 0.000373 ***
## Measles                         -1.963e-05  8.482e-06  -2.314 0.020753 *  
## BMI                              4.061e-02  5.573e-03   7.287 4.32e-13 ***
## under.five.deaths               -7.439e-02  6.830e-03 -10.892  < 2e-16 ***
## Polio                            2.326e-02  4.977e-03   4.674 3.12e-06 ***
## Total.expenditure                9.821e-02  3.769e-02   2.606 0.009231 ** 
## Diphtheria                       3.647e-02  5.010e-03   7.279 4.57e-13 ***
## HIV.AIDS                        -4.469e-01  1.995e-02 -22.403  < 2e-16 ***
## thinness..1.19.years            -7.361e-02  2.672e-02  -2.755 0.005920 ** 
## Income.composition.of.resources  5.679e+00  7.296e-01   7.783 1.05e-14 ***
## Schooling                        6.630e-01  4.872e-02  13.608  < 2e-16 ***
## logGDP                           3.765e-01  6.957e-02   5.411 6.91e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared:  0.8186, Adjusted R-squared:  0.8174 
## F-statistic: 700.3 on 15 and 2328 DF,  p-value: < 2.2e-16
# Stepwise (both directions) result:
both <- summary(object = stepboth)
both
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0079  -2.1041   0.0671   2.3681  16.0263 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.193e+01  6.942e-01  74.809  < 2e-16 ***
## Adult.Mortality                 -2.089e-02  8.883e-04 -23.515  < 2e-16 ***
## infant.deaths                    9.992e-02  9.246e-03  10.807  < 2e-16 ***
## Alcohol                          1.012e-01  2.736e-02   3.700 0.000221 ***
## percentage.expenditure           1.944e-04  5.454e-05   3.564 0.000373 ***
## Measles                         -1.963e-05  8.482e-06  -2.314 0.020753 *  
## BMI                              4.061e-02  5.573e-03   7.287 4.32e-13 ***
## under.five.deaths               -7.439e-02  6.830e-03 -10.892  < 2e-16 ***
## Polio                            2.326e-02  4.977e-03   4.674 3.12e-06 ***
## Total.expenditure                9.821e-02  3.769e-02   2.606 0.009231 ** 
## Diphtheria                       3.647e-02  5.010e-03   7.279 4.57e-13 ***
## HIV.AIDS                        -4.469e-01  1.995e-02 -22.403  < 2e-16 ***
## thinness..1.19.years            -7.361e-02  2.672e-02  -2.755 0.005920 ** 
## Income.composition.of.resources  5.679e+00  7.296e-01   7.783 1.05e-14 ***
## Schooling                        6.630e-01  4.872e-02  13.608  < 2e-16 ***
## logGDP                           3.765e-01  6.957e-02   5.411 6.91e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared:  0.8186, Adjusted R-squared:  0.8174 
## F-statistic: 700.3 on 15 and 2328 DF,  p-value: < 2.2e-16
# Print the full model (call and coefficients) for comparison with the
# selected models above.
fitFull
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP, data = training)
## 
## Coefficients:
##                     (Intercept)                  Adult.Mortality  
##                       5.188e+01                       -2.091e-02  
##                   infant.deaths                          Alcohol  
##                       9.970e-02                        1.017e-01  
##          percentage.expenditure                          Measles  
##                       1.944e-04                       -1.934e-05  
##                             BMI                under.five.deaths  
##                       4.111e-02                       -7.429e-02  
##                           Polio                Total.expenditure  
##                       2.327e-02                        9.986e-02  
##                      Diphtheria                         HIV.AIDS  
##                       3.639e-02                       -4.473e-01  
##            thinness..1.19.years               thinness.5.9.years  
##                      -1.103e-01                        4.121e-02  
## Income.composition.of.resources                        Schooling  
##                       5.677e+00                        6.622e-01  
##                          logGDP  
##                       3.792e-01
# AIC-based forward selection from olsrr on the full model; the printed
# selection summary lists the variables in the order they were added.
olsrr::ols_step_forward_aic(fitFull)
## 
##                                        Selection Summary                                         
## ------------------------------------------------------------------------------------------------
## Variable                              AIC         Sum Sq         RSS         R-Sq      Adj. R-Sq 
## ------------------------------------------------------------------------------------------------
## Schooling                          15530.008    108310.949    103218.899    0.51204      0.51183 
## Adult.Mortality                    14375.533    148508.373     63021.474    0.70207      0.70181 
## HIV.AIDS                           13971.630    158528.968     53000.879    0.74944      0.74912 
## Diphtheria                         13721.013    163943.928     47585.920    0.77504      0.77465 
## BMI                                13591.942    166531.793     44998.054    0.78727      0.78682 
## Income.composition.of.resources    13493.362    168421.796     43108.051    0.79621      0.79568 
## logGDP                             13432.779    169557.522     41972.325    0.80158      0.80098 
## Polio                              13408.221    170030.391     41499.456    0.80381      0.80314 
## thinness..1.19.years               13387.220    170435.632     41094.216    0.80573      0.80498 
## Measles                            13372.433    170728.879     40800.969    0.80711      0.80629 
## percentage.expenditure             13360.849    170964.653     40565.194    0.80823      0.80732 
## Total.expenditure                  13353.576    171124.817     40405.030    0.80899      0.80800 
## Alcohol                            13351.222    171199.787     40330.061    0.80934      0.80828 
## ------------------------------------------------------------------------------------------------

#KNN

#library(caret)

#fit_cont = trainControl(method = "repeatedcv", number = 10, repeats = 1)
#set.seed(136)

#knnfit = train(Life.expectancy~Adult.Mortality + infant.deaths + Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP, data =cleandataMedian, method = "knn", trControl = fit_cont, tuneGrid = expand.grid(k = c(1:30)))

#knnfit

#plot(knnfit)

# KNN fitted on the training split

library(caret)
# 10-fold cross-validation (repeated once) is used to choose k.
fit_cont1 <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

set.seed(1364)

# KNN is distance based, so predictors on very different scales (for example
# Measles counts in the thousands next to Income.composition.of.resources
# below 1) would otherwise dominate the neighbour search. The predictors are
# therefore centred and scaled; caret estimates the transformation from the
# data passed to train() and applies it again in predict().
knnfit1 <- train(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = training,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fit_cont1,
  tuneGrid = expand.grid(k = c(1:30))
)


# Cross-validated performance for each candidate k.
plot(knnfit1)

# Validation rows restricted to the response and the predictors in the model.
updateval <- validate[, c(
  "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol",
  "percentage.expenditure", "Measles", "BMI", "under.five.deaths", "Polio",
  "Total.expenditure", "Diphtheria", "HIV.AIDS", "thinness..1.19.years",
  "thinness.5.9.years", "Income.composition.of.resources", "Schooling",
  "logGDP"
)]


prediction <- predict(knnfit1, newdata = updateval)

# Observed and predicted values on the validation split, with per-row errors.
MSPE <- data.frame(Observed = validate$Life.expectancy, Predicted = prediction)

MSPE$Residual <- MSPE$Observed - MSPE$Predicted

MSPE$SquaredResidual <- MSPE$Residual^2

# Root of the mean squared prediction error on the validation split.
sqrt(mean(MSPE$SquaredResidual))
# NOTE(review): the previously recorded value, 4.840354, came from the fit
# without centring and scaling; re-run to obtain the value for this model.
#prediction = predict(knnfit1, newdata = validate)
#cf = confusionMatrix(prediction, updateval$Life.expectancy)
#cf
#print(cf)

#World Map, Color plotting

#Creating the World 
#library(ggplot2)
#library(tidyverse)
#library(ggthemes)

#world_map = map_data("world") %>% filter(! long > 180)

#countries = world_map %>% distinct(region) %>% rowid_to_column()

#countries %>% ggplot(aes(fill = rowid, map_id = region)) + geom_map(map = world_map) + expand_limits(x = world_map$long, y = world_map$lat) + coord_map("moll") +theme_map()

#Color world plotting with the training data

library(ggplot2)
library(tidyverse)

# Copy of the training data for the maps. The first column (Country) is
# renamed to `region`, the key used when joining with the world map data.
dataforcolmap <- training
names(dataforcolmap)[1] <- "region"



# Country names as spelled in the data (left) and the name to use on the map
# (right), e.g. "United States of America" becomes "USA".
region_lookup <- c(
  "United States of America" = "USA",
  "Bolivia (Plurinational State of)" = "Bolivia",
  "Venezuela (Bolivarian Republic of)" = "Venezuela",
  "Republic of Korea" = "South Korea",
  "The former Yugoslav republic of Macedonia" = "North Macedonia",
  "Republic of Moldova" = "Moldova",
  "Russian Federation" = "Russia",
  "Micronesia (Federated States of)" = "Micronesia",
  "Lao People's Democratic Republic" = "Laos",
  "Iran (Islamic Republic of)" = "Iran",
  "Democratic People's Republic of Korea" = "North Korea"
)
needs_rename <- dataforcolmap$region %in% names(region_lookup)
dataforcolmap$region[needs_rename] <-
  unname(region_lookup[dataforcolmap$region[needs_rename]])


view(dataforcolmap)


# Getting map data for plotting
mapdata <- map_data("world")
#view(mapdata)

# `dataforcolmap` holds several rows (years) per country, so joining it
# directly repeated every map vertex once per matching year and raised dplyr's
# many-to-many warning. Keep a single row per country first, so each vertex is
# matched at most once and each polygon has one unambiguous value.
# NOTE(review): the row kept is the first one per country in `dataforcolmap`,
# presumably the most recent sampled year, since the raw file lists years in
# descending order. Confirm, or aggregate explicitly if another summary is wanted.
country_rows <- dplyr::distinct(dataforcolmap, region, .keep_all = TRUE)

# Joining map data with one data row per country; `relationship` makes dplyr
# raise an error if a region ever matches more than one data row again.
mapdata <- left_join(
  mapdata,
  country_rows,
  by = "region",
  relationship = "many-to-one"
)
#view(mapdata)

# The left join leaves NA in the data columns of map regions that have no
# matching row; drop those rows separately for each variable to be mapped.

# Life expectancy
mapdata1 <- dplyr::filter(mapdata, !is.na(Life.expectancy))

# Status
mapdata2 <- dplyr::filter(mapdata, !is.na(Status))

# Income composition of resources
mapdata3 <- dplyr::filter(mapdata, !is.na(Income.composition.of.resources))


# Theme shared by the three maps: hides axis text, ticks and titles.
map_axes_blank <- theme(
  axis.text.x = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks = element_blank(),
  axis.title.y = element_blank(),
  axis.title.x = element_blank()
)

# Mapping mapdata1 for life expectancy (red = low, yellow = high)

map1 <- ggplot(mapdata1, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Life.expectancy), color = "black") +
  map_axes_blank +
  ggtitle("Life Expectancy per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Life Expectancy"))
map1

# Mapping mapdata2 for Status
# The earlier `col = "orange"` inside aes() is removed: the constant
# `color = "black"` on the same layer overrides it, so it never had an effect.

mapStatus <- ggplot(mapdata2, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Status), color = "black") +
  map_axes_blank +
  ggtitle("Country's Status: Developed v. Developing")
mapStatus

# Mapping mapdata3 for Income Composition of Resources (red = low, yellow = high)

mapIncome <- ggplot(mapdata3, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Income.composition.of.resources), color = "black") +
  map_axes_blank +
  ggtitle("Income Composition of Resources per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Income Composition of Resources"))
mapIncome

#World Map with Best Model


# Using rpart library
#treeimb <- rpart(ExplVar ~ ., data = train)
#pred.treeimb <- predict(treeimb, newdata = test)